Data available at: https://thl.fi/publications/monica/monograph_cd/formats/survey.htm
The file used here is the 20% sample of the survey core data found here: https://www.thl.fi/publications/monica/monograph_cd/data/form04_1.zip
Data dictionary is here: https://www.thl.fi/publications/monica/monograph_cd/formats/form048.htm
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
monicaform048 <- read_fwf("form04_3.zip", fwf_cols(form = c(1,2), versn=c(3,3), centre=c(4,5), runit=c(6,7), serial=c(8,13), numsur=c(14,14), samunit=c(15,17), dexam=c(18,25), mbirth=c(26,33), agegrp=c(34,34), sex=c(35,35), marit=c(36,36), edlevel=c(37,37), school=c(38,39), cigs=c(40,40), numcigs=c(41,43), daycigs=c(44,44), evercig=c(45,45), stop=c(46,49), iflyear=c(50,50), maxcigs=c(51.53), cigage=c(54,55), cigarsm=c(56,56), cigar=c(57,59), pipesm=c(60,60), pipe=c(61,63), othersm=c(64,65), hibp=c(66,66), drugs=c(67,67), bprecd=c(68,68), high=c(69,69), chdt=c(70,70), chrx=c(71,71), chrecd=c(72,72), asp=c(73,73), menop=c(74,74), agem=c(75,76), horm=c(77,77), pill=c(78,78), syst1=c(79,81), diast1=c(82,84), rz1=c(85,86), syst2=c(87,89), diast2=c(90,92), rz2=c(93,94), cuff=c(95,95), arm=c(96,97), bpcoder=c(98,99), timebp=c(100,103), rtemp=c(104,105), chol=c(106,108), choldl=c(109,111), dchol=c(112,119), hdl=c(120,122), hdldl=c(123,125), dhdl=c(126,133), scn=c(134,136), cotin=c(137,140), carbmon=c(141,142), height=c(143,145), weight=c(146,149), waist=c(150,153), hip=c(154,157), whcoder=c(158,159), oversion=c(160,160), eage=c(161,162), eageg=c(163,163), cohort1=c(164,164), cohort2=c(165,165), edtert1=c(166,166), edtert2=c(167,167), systm=c(168,171), diastm=c(172,175), chola=c(176,179), hdla=c(180,183)))
## Parsed with column specification:
## cols(
## .default = col_double(),
## form = col_character(),
## runit = col_character(),
## serial = col_character(),
## dexam = col_character(),
## school = col_character(),
## numcigs = col_character(),
## cigage = col_character(),
## cigar = col_character(),
## pipe = col_character(),
## othersm = col_character(),
## syst1 = col_character(),
## diast1 = col_character(),
## rz1 = col_character(),
## syst2 = col_character(),
## diast2 = col_character(),
## rz2 = col_character(),
## cuff = col_character(),
## arm = col_character(),
## bpcoder = col_character(),
## timebp = col_character()
## # ... with 14 more columns
## )
## See spec(...) for full column specifications.
monicaform048
## # A tibble: 11,833 x 75
## form versn centre runit serial numsur samunit dexam mbirth agegrp sex
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 04 8 10 02 071295 3 888 1805… 9.91e7 1 1
## 2 04 8 10 02 071302 3 888 2605… 9.91e7 1 1
## 3 04 8 10 02 071310 3 888 2005… 9.91e7 1 1
## 4 04 8 10 02 071320 3 888 2607… 9.91e7 1 1
## 5 04 8 10 02 071328 3 888 2107… 9.90e7 1 1
## 6 04 8 10 02 071334 3 888 1208… 9.91e7 1 1
## 7 04 8 10 02 071344 3 888 2906… 9.91e7 1 1
## 8 04 8 10 02 071351 3 888 0308… 9.91e7 1 1
## 9 04 8 10 02 071358 3 888 0111… 9.91e7 1 1
## 10 04 8 10 02 071366 3 888 2507… 9.90e7 1 1
## # … with 11,823 more rows, and 64 more variables: marit <dbl>,
## # edlevel <dbl>, school <chr>, cigs <dbl>, numcigs <chr>, daycigs <dbl>,
## # evercig <dbl>, stop <dbl>, iflyear <dbl>, maxcigs <dbl>, cigage <chr>,
## # cigarsm <dbl>, cigar <chr>, pipesm <dbl>, pipe <chr>, othersm <chr>,
## # hibp <dbl>, drugs <dbl>, bprecd <dbl>, high <dbl>, chdt <dbl>,
## # chrx <dbl>, chrecd <dbl>, asp <dbl>, menop <dbl>, agem <dbl>,
## # horm <dbl>, pill <dbl>, syst1 <chr>, diast1 <chr>, rz1 <chr>,
## # syst2 <chr>, diast2 <chr>, rz2 <chr>, cuff <chr>, arm <chr>,
## # bpcoder <chr>, timebp <chr>, rtemp <dbl>, chol <chr>, choldl <dbl>,
## # dchol <chr>, hdl <chr>, hdldl <chr>, dhdl <chr>, scn <chr>,
## # cotin <dbl>, carbmon <dbl>, height <dbl>, weight <chr>, waist <chr>,
## # hip <chr>, whcoder <chr>, oversion <dbl>, eage <dbl>, eageg <dbl>,
## # cohort1 <dbl>, cohort2 <dbl>, edtert1 <dbl>, edtert2 <dbl>,
## # systm <chr>, diastm <chr>, chola <chr>, hdla <chr>
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
mf = monicaform048 %>% mutate(
dexam=parse_date_time(gsub("(99)+", "", dexam), orders=c("dmy", "my", "y")),
mbirth=parse_date_time(gsub("99", "", substr(mbirth, 3, nchar(mbirth))), orders=c("my", "y")),
high = as.integer(high), # ever told by doc or provider that you have high cholesterol 1=yes, 2=no, 9=insufficient data
chdt = as.integer(chdt), # on a special diet for high chol? 1 = yes, 2=no, 3=uncertain, 8=never told they had high chol, 9=insufficient data
chrx = as.integer(chrx), # taking meds (in last two weeks) 1=yes, 2=no, 3=uncertain, 8=never told had high chol, 9=insufficient
dchol=dmy(dchol),
chol=as.integer(chol)/10, # Total serum cholesterol (mmol/l)
choldl=as.integer(choldl), # Total serum cholesterol (mg/dl)
hdl=as.integer(hdl)/100.,
hdldl=as.integer(hdldl),
height=as.integer(height)/100,
weight=as.integer(weight)*100, # recorded in 100 g units
eage=as.integer(eage), # Age on date of examination
eageg=as.integer(eageg)) # Age group on date of exam, but see data dict above, because coding varies by data collection center
## Warning: 3468 failed to parse.
mf <- mf %>% select(sex, eage, high, chdt, chrx, chol, choldl, weight, height)
mf
## # A tibble: 11,833 x 9
## sex eage high chdt chrx chol choldl weight height
## <dbl> <int> <int> <int> <int> <dbl> <int> <dbl> <dbl>
## 1 1 33 2 8 8 88.8 888 68800 1.7
## 2 1 32 2 8 8 4.7 888 88800 1.78
## 3 1 31 2 8 8 4.6 888 91000 1.63
## 4 1 32 2 8 8 4.7 888 68800 1.69
## 5 1 30 2 8 8 4.1 888 61200 1.74
## 6 1 30 2 8 8 5.3 888 68600 1.78
## 7 1 31 2 8 8 6.8 888 77600 1.86
## 8 1 33 2 8 8 99.9 888 999900 9.99
## 9 1 32 2 8 8 6.2 888 59600 1.73
## 10 1 30 2 8 8 4.5 888 66600 1.83
## # … with 11,823 more rows
mf <- mf %>%
filter(chol < 99.9, eage < 99, high < 9, chdt != 3, chdt != 9,
chrx != 3, chrx != 9,
choldl < 999, (chol < 88.8 | choldl < 888),
weight < 999900, height < 9.99) %>%
mutate(weight=weight/1000) %>%
mutate(bmi=weight/height^2) %>%
mutate(chol=ifelse(chol < 88.8, chol, choldl*0.02586)) %>%
mutate(hychol=ifelse(chol>=6.5, 1, 0), bmigrp=case_when(bmi<25 ~ 1, bmi<30~2, TRUE ~3)) %>%
mutate(chdt=ifelse(chdt==8, 2, 1), chrx=ifelse(chrx==8, 2, 1)) %>% # If either indicates high was no, assume they are not on diet or meds for high cholesterol
select(-choldl)
mf
## # A tibble: 9,006 x 11
## sex eage high chdt chrx chol weight height bmi hychol bmigrp
## <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 32 2 2 2 4.7 88.8 1.78 28.0 0 2
## 2 1 31 2 2 2 4.6 91 1.63 34.3 0 3
## 3 1 32 2 2 2 4.7 68.8 1.69 24.1 0 1
## 4 1 30 2 2 2 4.1 61.2 1.74 20.2 0 1
## 5 1 30 2 2 2 5.3 68.6 1.78 21.7 0 1
## 6 1 31 2 2 2 6.8 77.6 1.86 22.4 1 1
## 7 1 32 2 2 2 6.2 59.6 1.73 19.9 0 1
## 8 1 30 2 2 2 4.5 66.6 1.83 19.9 0 1
## 9 1 25 2 2 2 5.4 82.6 1.8 25.5 0 2
## 10 1 29 2 2 2 3.6 58.4 1.65 21.5 0 1
## # … with 8,996 more rows
write_csv(mf, "monica-chol.csv")
summary(mf)
## sex eage high chdt
## Min. :1.000 Min. :25.00 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:37.00 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :46.00 Median :2.000 Median :2.000
## Mean :1.516 Mean :45.84 Mean :1.851 Mean :1.843
## 3rd Qu.:2.000 3rd Qu.:55.00 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :64.00 Max. :2.000 Max. :2.000
## chrx chol weight height
## Min. :1.000 Min. : 2.500 Min. : 33.00 Min. :1.220
## 1st Qu.:2.000 1st Qu.: 4.888 1st Qu.: 63.00 1st Qu.:1.610
## Median :2.000 Median : 5.600 Median : 72.20 Median :1.670
## Mean :1.844 Mean : 5.677 Mean : 73.65 Mean :1.677
## 3rd Qu.:2.000 3rd Qu.: 6.400 3rd Qu.: 82.70 3rd Qu.:1.740
## Max. :2.000 Max. :17.300 Max. :193.40 Max. :2.000
## bmi hychol bmigrp
## Min. :15.07 Min. :0.0000 Min. :1.000
## 1st Qu.:22.95 1st Qu.:0.0000 1st Qu.:1.000
## Median :25.47 Median :0.0000 Median :2.000
## Mean :26.14 Mean :0.2346 Mean :1.719
## 3rd Qu.:28.57 3rd Qu.:0.0000 3rd Qu.:2.000
## Max. :64.08 Max. :1.0000 Max. :3.000
ggplot(mf, aes(x = eage, y = chol)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
ggplot(mf, aes(x = bmi, y = chol)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
ggplot(mf, aes(x = eage, y = weight)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(ggplot(mf, aes(x=bmi, fill=as.factor(sex))) + geom_histogram(binwidth = 1, alpha = 0.3, position="identity"))
plot_histogram <- function(df, feature) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)))) +
geom_histogram(aes(y = ..density..), alpha=0.7, fill="#33AADE", color="black") +
geom_density(alpha=0.3, fill="red") +
geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
labs(x=feature, y = "Density")
print(plt)
}
plot_multi_histogram <- function(df, feature, label_column) {
plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
geom_histogram(alpha=0.7, position="identity", aes(y = ..density..), color="black") +
geom_density(alpha=0.7) +
geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
labs(x=feature, y = "Density")
plt + guides(fill=guide_legend(title=label_column))
}
plot_multi_histogram(mf %>% mutate(sex=as.factor(sex)), 'bmi', 'sex')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.